# install.packages("ggplot2","dplyr")
library(ggplot2)
library(dplyr)
library(tidyr)
# R must be at least 3.3.1 for `tm` and `slam` to work.
# install.packages("tm")
# install.packages("SnowballC")
library(tm)
Loading required package: NLP
Attaching package: ‘NLP’
The following object is masked from ‘package:ggplot2’:
annotate
#system("ls ../input") # do we need this?
Adapted from this Kaggle notebook.
shak<-read.csv("../data/Shakespeare_data.csv",header = TRUE, as.is = TRUE)
#shak<-na.omit(shak)
head(shak)
Dataline Play PlayerLinenumber ActSceneLine Player
1 1 Henry IV NA
2 2 Henry IV NA
3 3 Henry IV NA
4 4 Henry IV 1 1.1.1 KING HENRY IV
5 5 Henry IV 1 1.1.2 KING HENRY IV
6 6 Henry IV 1 1.1.3 KING HENRY IV
PlayerLine
1 ACT I
2 SCENE I. London. The palace.
3 Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others
4 So shaken as we are, so wan with care,
5 Find we a time for frighted peace to pant,
6 And breathe short-winded accents of new broils
# play level word frequency
plays <- unique(shak$Play)
loveFreq<-numeric()
for (i in 1:length(plays)){
text <- Corpus(VectorSource(paste(shak[shak$Play==plays[i],]$PlayerLine,collapse=" ")))
text <- tm_map(text, removePunctuation)
text <- tm_map(text, PlainTextDocument)
text <- tm_map(text, removeWords, stopwords('english'))
# stemming to merge all "loved", "loving" into one
text <- tm_map(text, stemDocument)
tdm <- TermDocumentMatrix(text)
loveFreq[i]<-as.numeric(slam::row_sums(tdm)["love"])
}
lPlay <- data.frame(plays,loveFreq)
lPlay <- na.omit(lPlay)
# order the plays based on the occurence of love
lPlay<-lPlay[order(-lPlay$loveFreq),]
head(lPlay)
plays loveFreq
35 Two Gentlemen of Verona 188
28 Romeo and Juliet 160
6 As you like it 138
22 A Midsummer nights dream 128
17 Loves Labours Lost 125
23 Much Ado about nothing 122
# player level word frequency
players <- unique(shak$Player)
loveFreq <- numeric()
for (i in 1:length(players)){
text <- Corpus(VectorSource(paste(shak[shak$Player==players[i],]$PlayerLine,collapse=" ")))
text <- tm_map(text, removePunctuation)
text <- tm_map(text, PlainTextDocument)
text <- tm_map(text, removeWords, stopwords('english'))
text <- tm_map(text,stemDocument)
tdm <- TermDocumentMatrix(text)
loveFreq[i] <- as.numeric(slam::row_sums(tdm)["love"])
}
lPlayer <- data.frame(players,loveFreq)
lPlayer <- na.omit(lPlayer)
#order
lPlayer <- lPlayer[order(-lPlayer$loveFreq),]
head(lPlayer)
players loveFreq
904 PROTEUS 59
190 ROSALIND 57
771 ROMEO 56
169 HELENA 46
906 JULIA 41
650 IAGO 40
shak %>%
group_by(Play) %>%
summarise(n = n()) %>%
ggplot(., aes(x=reorder(Play, n),y=n)) +
geom_bar(stat="identity") +
coord_flip() +
ggtitle("Length of Shakespeare's plays") +
theme(legend.position="none") +
xlab("Play") +
ylab("Number of lines")
shak %>%
filter(Play == "Hamlet") %>%
group_by(Player) %>%
summarise(n = n()) %>%
ggplot(., aes(x=reorder(Player, n),y=n)) +
geom_bar(stat="identity") +
coord_flip() +
ggtitle("Speech in Hamlet") +
theme(legend.position="none") +
xlab("Player") +
ylab("Number of lines")
shak %>%
group_by(Play,Player) %>%
summarise(n = n()) %>%
filter(n > 700) %>%
ggplot(., aes(x=reorder(Player, n),y=n)) +
geom_bar(aes(fill=Play),stat="identity") +
coord_flip() +
ggtitle("Amount of lines by character") +
# theme(legend.position="none") +
xlab("Player") +
ylab("Number of lines")
lPlay %>%
ggplot(., aes(x=reorder(plays, loveFreq),y=loveFreq)) +
geom_bar(aes(fill=plays),stat="identity") +
coord_flip() +
ggtitle("Love in each play") +
# theme(legend.position="none") +
xlab("Play") +
ylab("frequency of the word 'love'") +
theme(legend.position = "none")
lPlayer %>%
filter(loveFreq > 20) %>%
ggplot(., aes(x=reorder(players, loveFreq),y=loveFreq)) +
geom_bar(aes(fill=players),stat="identity") +
coord_flip() +
ggtitle("Love in each play") +
# theme(legend.position="none") +
xlab("Play") +
ylab("frequency of the word 'love'") +
theme(legend.position = "none")
library(dplyr)
#install.packages("tidytext")
library(tidytext)
shak %>%
as_tibble(.) %>%
unnest_tokens(tbl=., input = PlayerLine, output = word)
[38;5;246m# A tibble: 820,204 x 6[39m
Dataline Play PlayerLinenumber ActSceneLine Player word
[3m[38;5;246m<int>[39m[23m [3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<int>[39m[23m [3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<chr>[39m[23m
[38;5;250m 1[39m 1 Henry IV [31mNA[39m [38;5;246m"[39m[38;5;246m"[39m [38;5;246m"[39m[38;5;246m"[39m act
[38;5;250m 2[39m 1 Henry IV [31mNA[39m [38;5;246m"[39m[38;5;246m"[39m [38;5;246m"[39m[38;5;246m"[39m i
[38;5;250m 3[39m 2 Henry IV [31mNA[39m [38;5;246m"[39m[38;5;246m"[39m [38;5;246m"[39m[38;5;246m"[39m scene
[38;5;250m 4[39m 2 Henry IV [31mNA[39m [38;5;246m"[39m[38;5;246m"[39m [38;5;246m"[39m[38;5;246m"[39m i
[38;5;250m 5[39m 2 Henry IV [31mNA[39m [38;5;246m"[39m[38;5;246m"[39m [38;5;246m"[39m[38;5;246m"[39m london
[38;5;250m 6[39m 2 Henry IV [31mNA[39m [38;5;246m"[39m[38;5;246m"[39m [38;5;246m"[39m[38;5;246m"[39m the
[38;5;250m 7[39m 2 Henry IV [31mNA[39m [38;5;246m"[39m[38;5;246m"[39m [38;5;246m"[39m[38;5;246m"[39m palace
[38;5;250m 8[39m 3 Henry IV [31mNA[39m [38;5;246m"[39m[38;5;246m"[39m [38;5;246m"[39m[38;5;246m"[39m enter
[38;5;250m 9[39m 3 Henry IV [31mNA[39m [38;5;246m"[39m[38;5;246m"[39m [38;5;246m"[39m[38;5;246m"[39m king
[38;5;250m10[39m 3 Henry IV [31mNA[39m [38;5;246m"[39m[38;5;246m"[39m [38;5;246m"[39m[38;5;246m"[39m henry
[38;5;246m# ... with 820,194 more rows[39m
shak %>%
as_tibble(.) %>%
unnest_tokens(tbl=., input = PlayerLine, output = word) %>%
count(word, sort = TRUE)
[38;5;246m# A tibble: 24,749 x 2[39m
word n
[3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<int>[39m[23m
[38;5;250m 1[39m the [4m2[24m[4m7[24m052
[38;5;250m 2[39m and [4m2[24m[4m5[24m082
[38;5;250m 3[39m i [4m2[24m[4m0[24m142
[38;5;250m 4[39m to [4m1[24m[4m8[24m984
[38;5;250m 5[39m of [4m1[24m[4m5[24m862
[38;5;250m 6[39m a [4m1[24m[4m4[24m196
[38;5;250m 7[39m you [4m1[24m[4m3[24m347
[38;5;250m 8[39m my [4m1[24m[4m1[24m875
[38;5;250m 9[39m in [4m1[24m[4m0[24m540
[38;5;250m10[39m that [4m1[24m[4m0[24m441
[38;5;246m# ... with 24,739 more rows[39m
shak %>%
as_tibble(.) %>%
unnest_tokens(tbl=., input = PlayerLine, output = word) %>%
anti_join(stop_words) %>%
count(word, sort = TRUE)
Joining, by = "word"
[38;5;246m# A tibble: 24,148 x 2[39m
word n
[3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<int>[39m[23m
[38;5;250m 1[39m thou [4m5[24m193
[38;5;250m 2[39m thy [4m3[24m727
[38;5;250m 3[39m thee [4m3[24m024
[38;5;250m 4[39m lord [4m2[24m621
[38;5;250m 5[39m sir [4m2[24m454
[38;5;250m 6[39m enter [4m2[24m338
[38;5;250m 7[39m love [4m1[24m927
[38;5;250m 8[39m hath [4m1[24m845
[38;5;250m 9[39m king [4m1[24m500
[38;5;250m10[39m tis [4m1[24m384
[38;5;246m# ... with 24,138 more rows[39m
shak %>%
as_tibble(.) %>%
unnest_tokens(tbl=., input = PlayerLine, output = word) %>%
anti_join(stop_words) %>%
count(word, sort = TRUE) %>%
filter(n>800) %>%
ggplot(., aes(x=reorder(word,n),y=n)) +
geom_bar(stat="identity") +
coord_flip()
Joining, by = "word"
How can we organise this so that we can compare across plays?
shak[,c(2,5,6)] %>%
as_tibble() %>%
unnest_tokens(tbl=., input = PlayerLine, output = word) %>%
filter(word=="love" | word =="king" | word=="death" | word=="sweet") %>%
#add_count(Player) %>%
group_by(Player,Play,word) %>%
summarise(n=n()) %>%
#anti_join(stop_words) %>%
filter( Play == "Hamlet" |
Play == "King Lear" |
Play == "A Midsummer nights dream" |
Play == "Othello" |
Play == "Henry V" |
Play == "Romeo and Juliet") %>%
arrange(desc(n)) %>%
ggplot(., aes(x=word,y=n)) +
geom_bar(aes(fill=word),stat="identity") +
# coord_flip() +
facet_wrap(~Play)
Is there a way to break it down to see who is saying what?
word <- c(NA,"thou","thee","thy","thine","dost","shalt","wilt","hast","hath","scene","tis","ii","iii","iv","v","vi","vii")
lexicon <- rep("shakespeare",length(word))
new_stop <- cbind(word,lexicon)
shak_stop <- rbind(new_stop,stop_words)
shak %>%
as_tibble() %>%
unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>% # separates bigram into two columns, one for each word
filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
count(word1, word2, sort = TRUE)
[38;5;246m# A tibble: 57,371 x 3[39m
word1 word2 n
[3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<int>[39m[23m
[38;5;250m 1[39m enter king 101
[38;5;250m 2[39m mine eyes 95
[38;5;250m 3[39m king henry 88
[38;5;250m 4[39m sir john 80
[38;5;250m 5[39m mark antony 76
[38;5;250m 6[39m mine honour 71
[38;5;250m 7[39m king richard 51
[38;5;250m 8[39m god save 48
[38;5;250m 9[39m gracious lord 46
[38;5;250m10[39m noble lord 46
[38;5;246m# ... with 57,361 more rows[39m
#install.packages("igraph")
#install.packages("ggraph")
library(igraph)
library(ggraph)
library(grid)
set.seed(814)
a <- grid::arrow(type = "closed", angle=22.5, length = unit(.1, "inches"))
p1 <- shak %>%
filter(Play=="Hamlet") %>%
as_tibble() %>%
unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>% # separates bigram into two columns, one for each word
filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
count(word1, word2, sort = TRUE) %>%
filter(n > 6) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n), edge_colour="darkblue", show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), repel=TRUE) + # , vjust = 1, hjust = 1) +
theme_void()
set.seed(814)
p2 <- shak %>%
filter(Play == "Twelfth Night") %>%
as_tibble() %>%
unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>% # separates bigram into two columns, one for each word
filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
count(word1, word2, sort = TRUE) %>%
filter(n > 6) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n), edge_colour="darkred", show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "salmon", size = 5) +
geom_node_text(aes(label = name), repel=TRUE) + # , vjust = 1, hjust = 1) +
theme_void()
set.seed(814)
p3 <- shak %>%
filter(Play == "Romeo and Juliet") %>%
as_tibble() %>%
unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>% # separates bigram into two columns, one for each word
filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
count(word1, word2, sort = TRUE) %>%
filter(n > 6) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n), edge_colour="darkgreen", show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "green2", size = 5) +
geom_node_text(aes(label = name), repel=TRUE) + # , vjust = 1, hjust = 1) +
theme_void()
multiplot(p1,p2,p3,cols=3)
set.seed(814)
a <- grid::arrow(type = "closed", angle=22.5, length = unit(.1, "inches"))
p1 <- shak %>%
filter(ActSceneLine != "") %>%
filter(Play=="Hamlet") %>%
as_tibble() %>%
unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>% # separates bigram into two columns, one for each word
filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
count(word1, word2, sort = TRUE) %>%
filter(n > 3) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n), edge_colour="darkblue", show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), repel=TRUE) + # , vjust = 1, hjust = 1) +
theme_void() +
ggtitle("Hamlet")
set.seed(814)
p2 <- shak %>%
filter(ActSceneLine != "") %>%
filter(Play == "Twelfth Night") %>%
as_tibble() %>%
unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>% # separates bigram into two columns, one for each word
filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
count(word1, word2, sort = TRUE) %>%
filter(n > 3) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n), edge_colour="darkred", show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "salmon", size = 5) +
geom_node_text(aes(label = name), repel=TRUE) + # , vjust = 1, hjust = 1) +
theme_void() +
ggtitle("Twelfth Night")
set.seed(814)
p3 <- shak %>%
filter(ActSceneLine != "") %>%
filter(Play == "Romeo and Juliet") %>%
as_tibble() %>%
unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>% # separates bigram into two columns, one for each word
filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
count(word1, word2, sort = TRUE) %>%
filter(n > 3) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n), edge_colour="darkgreen", show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "green2", size = 5) +
geom_node_text(aes(label = name), repel=TRUE) + # , vjust = 1, hjust = 1) +
theme_void() +
ggtitle("Romeo and Juliet")
set.seed(814)
p4 <- shak %>%
filter(ActSceneLine != "") %>%
filter(Play == "Othello") %>%
as_tibble() %>%
unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>% # separates bigram into two columns, one for each word
filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
count(word1, word2, sort = TRUE) %>%
filter(n > 3) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n), edge_colour="darkorange", show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "orange", size = 5) +
geom_node_text(aes(label = name), repel=TRUE) + # , vjust = 1, hjust = 1) +
theme_void() +
ggtitle("Othello")
set.seed(814)
p5 <- shak %>%
filter(ActSceneLine != "") %>%
filter(Play == "Henry IV") %>%
as_tibble() %>%
unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>% # separates bigram into two columns, one for each word
filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
count(word1, word2, sort = TRUE) %>%
filter(n > 3) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n), edge_colour="cadetblue4", show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "cyan", size = 5) +
geom_node_text(aes(label = name), repel=TRUE) + # , vjust = 1, hjust = 1) +
theme_void() +
ggtitle("Henry IV")
set.seed(814)
p6 <- shak %>%
filter(ActSceneLine != "") %>%
filter(Play == "The Tempest") %>%
as_tibble() %>%
unnest_tokens(input = PlayerLine, output = bigram, token = "ngrams", n = 2) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>% # separates bigram into two columns, one for each word
filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
count(word1, word2, sort = TRUE) %>%
filter(n > 3) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n), edge_colour="violet", show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "magenta", size = 5) +
geom_node_text(aes(label = name), repel=TRUE) + # , vjust = 1, hjust = 1) +
theme_void() +
ggtitle("The Tempest")
multiplot(p1,p2,p3,p4,p5,p6,cols=3)
This should give us a better idea of slightly looser connections
set.seed(814)
a <- grid::arrow(type = "closed", angle=22.5, length = unit(.1, "inches"))
shak %>%
as_tibble() %>%
filter(ActSceneLine != "") %>%
unnest_tokens(input = PlayerLine, output = trigram, token = "ngrams", n = 3) %>%
separate(trigram, c("word1", "word2", "word3"), sep = " ") %>% # separates bigram into two columns, one for each word
filter(!word1 %in% shak_stop$word) %>% # filters stop words from first column
filter(!word2 %in% shak_stop$word) %>% # filters stop words from second column
filter(!word3 %in% shak_stop$word) %>% # filters stop words from third column
count(word1, word2, word3, sort = TRUE) %>%
filter(n > 2) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n), edge_colour="darkblue", show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), repel=TRUE) + # , vjust = 1, hjust = 1) +
theme_void()
What happens if we treat the first pair and second pair of trigrams as separate bigrams and graph them as before?
shak %>%
as_tibble() %>%
#filter(Play == "Hamlet" | Play == "Loves Labours Lost" | Play == "A Winters Tale") %>%
filter(ActSceneLine != "") %>%
mutate(ActSceneLine2 = ActSceneLine) %>%
separate(ActSceneLine2, c("act", "scene", "line")) %>%
count(Play,act,scene, sort=TRUE) %>%
transmute(play=Play, act=as.numeric(act), scene=as.numeric(scene), n=n)
[38;5;246m# A tibble: 737 x 4[39m
play act scene n
[3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<dbl>[39m[23m [3m[38;5;246m<dbl>[39m[23m [3m[38;5;246m<int>[39m[23m
[38;5;250m 1[39m Loves Labours Lost 5 2 972
[38;5;250m 2[39m A Winters Tale 4 4 929
[38;5;250m 3[39m Hamlet 2 2 616
[38;5;250m 4[39m King John 2 1 609
[38;5;250m 5[39m The Tempest 1 2 596
[38;5;250m 6[39m Cymbeline 5 5 584
[38;5;250m 7[39m Measure for measure 5 1 580
[38;5;250m 8[39m Timon of Athens 4 3 577
[38;5;250m 9[39m Richard III 4 4 561
[38;5;250m10[39m A Winters Tale 1 2 539
[38;5;246m# ... with 727 more rows[39m
What this all seems to tell us is that we can visualise the structure of the play, separate from their content. Is this useful to you?